Exam dna2.fasta Please Note: No Grace Period

Welcome to the final exam.
If you haven't yet read the instructions, you can do so here.
Please run the following data set in the program(s) that you have written:
dna2.fasta
If you created your program(s) correctly, you will be able to answer the questions below.



In [1]:

    
import os.path
dna2_fasta_file_name = "dna2.fasta"
dna2_fasta_file_path = "./data"
dna2_fasta_file_directory = os.path.join(dna2_fasta_file_path, dna2_fasta_file_name)
print "dna2_fasta_file_directory:%s" % dna2_fasta_file_directory

# open file and attain data from file
try: 
    f = open(dna2_fasta_file_directory, "r")
except Exception as e: 
    print e
    
try: 
    data = f.read()
    # read = > str, readlines = > list, readline = > str
    print "read data from %s successfully." % dna2_fasta_file_directory
except Exception as e: 
    print e









    



dna2_fasta_file_directory:./data/dna2.fasta
read data from ./data/dna2.fasta successfully.

Question 1

How many records are in the multi-FASTA file?



In [2]:

    
# str.count function
s = "asdfdfsas"
print s.count("s")



In [3]:

    
# count record in variable "data" according to the special symbol ">"(each record has this symbol)
record_num = data.count(">")
print "record_num:%s" % record_num

# close file
try: 
    f.close()
    print "close %s file successfully." % dna2_fasta_file_directory
except Exception as e:
    print e









    



record_num:18
close ./data/dna2.fasta file successfully.

Question 2

What is the length of the longest sequence in the file?

10457
461
4894
2341



In [4]:

    
# open file and load data
try:
    f = open(dna2_fasta_file_directory, "r")
    data_string_list = f.readlines()
    print "open %s file and load data successfully." % dna2_fasta_file_directory
except Exception as e:
    print e









    



open ./data/dna2.fasta file and load data successfully.



In [5]:

    
# generate each record string as a element in list from data_string_list
record_list = []
record_meta_list = []
for string_idx in xrange(len(data_string_list)):
    string = data_string_list[string_idx]
    if string.count(">") == 1:
        record_meta_list.append(string)
        
        if string_idx != 0:
            record_list.append(cur_record)
            
        cur_record = ""
        continue
    else:
        cur_record = cur_record + string
        if string_idx == len(data_string_list) -  1:
            record_list.append(cur_record)
            
print "record num.:%s" % len(record_list)
print "len(record_meta_list):%s" % len(record_meta_list)









    



record num.:18
len(record_meta_list):18



In [6]:

    
# remove special symbols
s = "asadafsdaafs"
print s.replace("a", "")









    



sdfsdfs



In [7]:

    
# remove the LINE BREAK character "\n" in variable record_list(each element is a string)
record_list = map(lambda record_string: record_string.replace("\n", ""), record_list)



In [8]:

    
def add(s1, s2):
    return s1+s2

s1_list = range(1, 5, 1) # [1, 2, 3, 4]
s2_list = range(-1, -5, -1)
print s1_list
print s2_list
print map(add, s1_list, s2_list)









    



[1, 2, 3, 4]
[-1, -2, -3, -4]
[0, 0, 0, 0]



In [9]:

    
record_len_list = map(lambda record_string: len(record_string), record_list)
max_len_record_length = max(record_len_list)
each_record_len_list = map(lambda idx, length: (idx, length), xrange(1, 19), record_len_list)

print "len(record_len_list):%s" % len(record_len_list)
print "record_len_list:%s" % record_len_list
print "max_len_record_length:%s" % max_len_record_length
print "each_record_len_list:%s" % each_record_len_list









    



len(record_len_list):18
record_len_list:[4635, 1151, 4894, 3511, 4076, 2867, 442, 890, 967, 4338, 1352, 4564, 4804, 964, 2095, 1432, 115, 2646]
max_len_record_length:4894
each_record_len_list:[(1, 4635), (2, 1151), (3, 4894), (4, 3511), (5, 4076), (6, 2867), (7, 442), (8, 890), (9, 967), (10, 4338), (11, 1352), (12, 4564), (13, 4804), (14, 964), (15, 2095), (16, 1432), (17, 115), (18, 2646)]

Question 3

What is the length of the shortest sequence in the file?



In [10]:

    
min_len_record_length = min(record_len_list)
print "min_len_record_length:%s" % min_len_record_length









    



min_len_record_length:115

Question 4

What is the length of the longest ORF appearing in reading frame 2 of any of the sequences?

1644
1560
1458
1401



In [11]:

    
s = "asdfa"
print s[3:].find("z")
print s[3:].index("z")









    



-1






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-55129b50642e> in <module>()
      1 s = "asdfa"
      2 print s[3:].find("z")
----> 3 print s[3:].index("z")

ValueError: substring not found



In [12]:

    
a = 0
while 1:
    a += 1
    print a
    if a == 3: break



In [13]:

    
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    # generate the sequence string corresponding to frame num.
    if frame_num == 1:
        pass
    elif frame_num == 2:
        sequence_string = sequence_string[1:]
    elif frame_num == 3:
        sequence_string = sequence_string[2:]
    return sequence_string

record_list_according_2_frame_2 = map(
    lambda record: get_sequence_string_accroding_2_frame_num(
        sequence_string = record,
        frame_num = 6
    ), record_list
)



In [14]:

    
def get_Ngram_list(sequence_string, gramN):
    # generate Ngram list
    # default gramN = 3, trigram
    Ngram_num = len(sequence_string) / gramN
    Ngram_list = map(
        lambda start_index: sequence_string[start_index:start_index + gramN], xrange(Ngram_num)
    )
    return Ngram_list

Ngram_2d_list = map(
    lambda record: get_Ngram_list(
        sequence_string = record,
        gramN = 3
    ), record_list_according_2_frame_2
)
print map(len, record_list_according_2_frame_2)









    



[4635, 1151, 4894, 3511, 4076, 2867, 442, 890, 967, 4338, 1352, 4564, 4804, 964, 2095, 1432, 115, 2646]



In [15]:

    
def find_length_of_longgest_ORF(Ngram_sequence_list):
    def find_first_target_index_in_list(trigram_string_list, target_string):
        target_index_in_list = -1
        try: 
            target_index_in_list = Ngram_sequence_list.index(target_string)
        except Exception as e:
            return target_index_in_list
        
    
    start_codon = "ATG"
    end_codon_list = ["TAA", "TAG", "TGA"]
    cur_ORF_list = []
    cur_max_length = 0
    cur_max_length_index = -1
    cur_start_index = 0
    cur_end_index = -1
    while 1:
        # find start inedx
        try: 
            cur_start_index = Ngram_sequence_list[cur_start_index:].index(start_codon)
        except Exception as e:
            print e
            return cur_max_length 
        
        # find end index
        end_index_list = map(
            lambda end_codon: find_first_target_index_in_list(Ngram_sequence_list,
                                                              end_codon), 
            end_codon_list
        )
        cur_end_index = min(end_index_list)
        if cur_end_index == -1: return cur_max_length

        # current ORF
        cur_ORF_list = Ngram_sequence_list[cur_start_index: cur_end_index]
        
        # update cur_max_length variable
        if cur_max_length <= len(Ngram_sequence_list[cur_start_index: cur_end_index]):
            cur_max_length = len(Ngram_sequence_list[cur_start_index: cur_end_index])
            cur_max_length_index = cur_start_index

        # exchange end index to start index
        cur_start_index = cur_end_index
        cur_end_index = -1
GG



In [ ]:

    
ORF_max_length_for_each_record_list = map(
    lambda Ngram_sequence_list: find_length_of_longgest_ORF(Ngram_sequence_list),
    Ngram_2d_list
)
print len(ORF_max_length_for_each_record_list)



In [30]:

    
help(list)









    



Help on class list in module __builtin__:

class list(object)
 |  list() -> new empty list
 |  list(iterable) -> new list initialized from iterable's items
 |  
 |  Methods defined here:
 |  
 |  __add__(...)
 |      x.__add__(y) <==> x+y
 |  
 |  __contains__(...)
 |      x.__contains__(y) <==> y in x
 |  
 |  __delitem__(...)
 |      x.__delitem__(y) <==> del x[y]
 |  
 |  __delslice__(...)
 |      x.__delslice__(i, j) <==> del x[i:j]
 |      
 |      Use of negative indices is not supported.
 |  
 |  __eq__(...)
 |      x.__eq__(y) <==> x==y
 |  
 |  __ge__(...)
 |      x.__ge__(y) <==> x>=y
 |  
 |  __getattribute__(...)
 |      x.__getattribute__('name') <==> x.name
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __getslice__(...)
 |      x.__getslice__(i, j) <==> x[i:j]
 |      
 |      Use of negative indices is not supported.
 |  
 |  __gt__(...)
 |      x.__gt__(y) <==> x>y
 |  
 |  __iadd__(...)
 |      x.__iadd__(y) <==> x+=y
 |  
 |  __imul__(...)
 |      x.__imul__(y) <==> x*=y
 |  
 |  __init__(...)
 |      x.__init__(...) initializes x; see help(type(x)) for signature
 |  
 |  __iter__(...)
 |      x.__iter__() <==> iter(x)
 |  
 |  __le__(...)
 |      x.__le__(y) <==> x<=y
 |  
 |  __len__(...)
 |      x.__len__() <==> len(x)
 |  
 |  __lt__(...)
 |      x.__lt__(y) <==> x<y
 |  
 |  __mul__(...)
 |      x.__mul__(n) <==> x*n
 |  
 |  __ne__(...)
 |      x.__ne__(y) <==> x!=y
 |  
 |  __repr__(...)
 |      x.__repr__() <==> repr(x)
 |  
 |  __reversed__(...)
 |      L.__reversed__() -- return a reverse iterator over the list
 |  
 |  __rmul__(...)
 |      x.__rmul__(n) <==> n*x
 |  
 |  __setitem__(...)
 |      x.__setitem__(i, y) <==> x[i]=y
 |  
 |  __setslice__(...)
 |      x.__setslice__(i, j, y) <==> x[i:j]=y
 |      
 |      Use  of negative indices is not supported.
 |  
 |  __sizeof__(...)
 |      L.__sizeof__() -- size of L in memory, in bytes
 |  
 |  append(...)
 |      L.append(object) -- append object to end
 |  
 |  count(...)
 |      L.count(value) -> integer -- return number of occurrences of value
 |  
 |  extend(...)
 |      L.extend(iterable) -- extend list by appending elements from the iterable
 |  
 |  index(...)
 |      L.index(value, [start, [stop]]) -> integer -- return first index of value.
 |      Raises ValueError if the value is not present.
 |  
 |  insert(...)
 |      L.insert(index, object) -- insert object before index
 |  
 |  pop(...)
 |      L.pop([index]) -> item -- remove and return item at index (default last).
 |      Raises IndexError if list is empty or index is out of range.
 |  
 |  remove(...)
 |      L.remove(value) -- remove first occurrence of value.
 |      Raises ValueError if the value is not present.
 |  
 |  reverse(...)
 |      L.reverse() -- reverse *IN PLACE*
 |  
 |  sort(...)
 |      L.sort(cmp=None, key=None, reverse=False) -- stable sort *IN PLACE*;
 |      cmp(x, y) -> -1, 0, 1
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __hash__ = None
 |  
 |  __new__ = <built-in method __new__ of type object>
 |      T.__new__(S, ...) -> a new object with type S, a subtype of T

Question 5

What is the starting position of the longest ORF in reading frame 3 in any of the sequences? The position should indicate the character number where the ORF begins. For instance, the following ORF:

sequence1
ATGCCCTAG
starts at position 1.

758
2338
636
832



In [16]:

    
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    # generate the sequence string corresponding to frame num.
    if frame_num == 1:
        pass
    elif frame_num == 2:
        sequence_string = sequence_string[1:]
    elif frame_num == 3:
        sequence_string = sequence_string[2:]
    return sequence_string

record_list_according_2_frame3 = map(
    lambda sequence_string: get_sequence_string_accroding_2_frame_num(
        sequence_string,
        frame_num = 3
    ), record_list
)



In [17]:

    
def get_Ngram_list(sequence_string, gramN):
    # generate Ngram list
    # default gramN = 3, trigram
    Ngram_num = len(sequence_string) / gramN
    Ngram_list = map(
        lambda start_index: sequence_string[start_index:start_index + gramN], xrange(Ngram_num)
    )
    return Ngram_list

Ngram_2d_list = map(
    lambda record: get_Ngram_list(
        sequence_string = record,
        gramN = 3
    ), record_list_according_2_frame3
)
print map(len, record_list_according_2_frame3)









    



[4633, 1149, 4892, 3509, 4074, 2865, 440, 888, 965, 4336, 1350, 4562, 4802, 962, 2093, 1430, 113, 2644]

Question 6

What is the length of the longest ORF appearing in any sequence and in any forward reading frame?

1719
294
2307
1560

Question 7

What is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|16?

1509
1458
1644
1317

Question 8

Find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?



In [ ]:

    
def generate_len_six_string_list(string):
    segment_length = 6
    len_six_string_list = []
    for idx in xrange(len(string) - segment_length):
        len_six_string_list.append(string[idx:idx + segment_length])
    return len_six_string_list



In [ ]:

    
# flatten 2-Dimension list variable
from compiler.ast import flatten
li = [[1, 2], [3], [4, 5, 2]]
print "li:%s" % li
print "flatten(li):%s" % flatten(li)



In [ ]:

    
# generate any all 6 length strings for each record from variable "record_list"
length6_2d_list = map(generate_len_six_string_list, record_list)
length6_list = flatten(length6_2d_list)
print "len(length6_list):%s" % len(length6_list)
length6_set = set(length6_list)
print "len(length6_set):%s" % len(length6_set)



In [ ]:

    
# statistic about most frequency string of length 6
length6_dict = dict()
for cur_length6_string in length6_set:
    cur_length6_string_exist_list = map(lambda length6_string_in_length6_list: length6_string_in_length6_list.count(cur_length6_string), length6_list)
    cur_length6_string_frequency = sum(cur_length6_string_exist_list)
    length6_dict[cur_length6_string] = cur_length6_string_frequency



In [ ]:

    
most_frequency_length6_value = max(length6_dict.values())
print "most_frequency_length6_value:%s" % most_frequency_length6_value

Question 9

Find all repeats of length 12 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 12. How many different 12-base sequences occur Max times?



In [ ]:

    
def generate_user_defined_length_string_list(string, segment_length):
    len_six_string_list = []
    for idx in xrange(len(string) - segment_length):
        len_six_string_list.append(string[idx:idx + segment_length])
    return len_six_string_list



In [ ]:

    
segment_length = 12
# generate any all 12 length strings for each record from variable "record_list"
length12_2d_list = map(
    lambda string: generate_user_defined_length_string_list(string, segment_length), record_list)
length12_list = flatten(length12_2d_list)
print "len(length12_list):%s" % len(length12_list)
length12_set = set(length12_list)
print "len(length12_set):%s" % len(length12_set)

length12_and_length_tuple_list = map(lambda length12: (length12, len(length12)), length12_set)
sorted_length12_and_length_tuple_list = sorted(length12_and_length_tuple_list, key = lambda tup: tup[1])
most_frequency_length12_and_length_tuple = sorted_length12_and_length_tuple_list[0]
print "most_frequency_length12_and_length_tuple:%s" % str(most_frequency_length12_and_length_tuple)
print "most_frequency_length12_and_length_tuple[0:2]:%s" % str(most_frequency_length12_and_length_tuple[0:2])

most_frequency_length12 = most_frequency_length12_and_length_tuple[0]
print "most_frequency_length12:%s" % most_frequency_length12

most_frequency_lenght12_count_in_each_record_list = map(
    lambda record: record.count(most_frequency_length12), record_list)
print "most_frequency_lenght12_count_in_each_record_list:%s" % most_frequency_lenght12_count_in_each_record_list

most_frequency_length12_count_sum = sum(most_frequency_lenght12_count_in_each_record_list)
print "most_frequency_length12_count_sum:%s" % most_frequency_length12_count_sum

Question 10

Which one of the following repeats of length 7 has a maximum number of occurrences?

CATCGCC
GCGCGCA
TGCGCGC
CGCGCCG



In [ ]:

    
# CATCGCC
pattern_string = "CATCGCC"
pattern_count_list = map(lambda record_string: record_string.count(pattern_string), record_list)
print "pattern_count_list:%s" % pattern_count_list
print "sum(pattern_count_list):%s" % sum(pattern_count_list)



In [ ]:

    
# GCGCGCA
pattern_string = "GCGCGCA"
pattern_count_list = map(lambda record_string: record_string.count(pattern_string), record_list)
print "pattern_count_list:%s" % pattern_count_list
print "sum(pattern_count_list):%s" % sum(pattern_count_list)



In [ ]:

    
# TGCGCGC
pattern_string = "TGCGCGC"
pattern_count_list = map(lambda record_string: record_string.count(pattern_string), record_list)
print "pattern_count_list:%s" % pattern_count_list
print "sum(pattern_count_list):%s" % sum(pattern_count_list)



In [ ]:

    
# CGCGCCG
pattern_string = "CGCGCCG"
pattern_count_list = map(lambda record_string: record_string.count(pattern_string), record_list)
print "pattern_count_list:%s" % pattern_count_list
print "sum(pattern_count_list):%s" % sum(pattern_count_list)